# Load packages ----
# NOTE: lines beginning with "##" throughout this file are console output
# captured by knitr, kept here as comments.
# Data Preparation
library(dplyr) # data manipulation
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
library(readr) # read rectangular data like csv, tsv
library(skimr) # provide broad overview of dataframe
library(xts) # handling time-based data
## Loading required package: zoo
##
## Attaching package: 'zoo'
## The following objects are masked from 'package:base':
##
## as.Date, as.Date.numeric
##
## Attaching package: 'xts'
## The following objects are masked from 'package:dplyr':
##
## first, last
library(lubridate) # easier to work with dates & time
##
## Attaching package: 'lubridate'
## The following objects are masked from 'package:base':
##
## date, intersect, setdiff, union
library(tidyr) # tidy messy data
# Data Visualization
library(ggplot2)
library(plotly) # produce interactive plots
##
## Attaching package: 'plotly'
## The following object is masked from 'package:ggplot2':
##
## last_plot
## The following object is masked from 'package:stats':
##
## filter
## The following object is masked from 'package:graphics':
##
## layout
library(ggcorrplot) # create correlation matrix using ggplot2
library(ggpubr) # easier to use functions working with ggplot2
library(viridis) # access to viridis palette
## Loading required package: viridisLite
library(gridExtra) # arrange multiple grid-based plots on a page
##
## Attaching package: 'gridExtra'
## The following object is masked from 'package:dplyr':
##
## combine
library(ggthemes) # create additional themes
library(corrplot) # create correlation plot
## corrplot 0.89 loaded
library(naniar) # display #s of missing values visually
##
## Attaching package: 'naniar'
## The following object is masked from 'package:skimr':
##
## n_complete
# Data Wrangling
library(varhandle) # to unfactor factor
library(Amelia) # visualizing missing values
## Loading required package: Rcpp
## ##
## ## Amelia II: Multiple Imputation
## ## (Version 1.8.0, built: 2021-05-26)
## ## Copyright (C) 2005-2022 James Honaker, Gary King and Matthew Blackwell
## ## Refer to http://gking.harvard.edu/amelia/ for more information
## ##
library(VIM) # display missing proportion plot
## Loading required package: colorspace
## Loading required package: grid
## VIM is ready to use.
## Suggestions and bug-reports can be submitted at: https://github.com/statistikat/VIM/issues
##
## Attaching package: 'VIM'
## The following object is masked from 'package:datasets':
##
## sleep
# Time Series
library(imputeTS) # use it for imputing missing values in ts objects
## Registered S3 method overwritten by 'quantmod':
## method from
## as.zoo.data.frame zoo
##
## Attaching package: 'imputeTS'
## The following object is masked from 'package:zoo':
##
## na.locf
library(tictoc) # timing function
library(MetricsWeighted) # performance measures used in machine learning
library(forecast)
##
## Attaching package: 'forecast'
## The following object is masked from 'package:MetricsWeighted':
##
## accuracy
## The following object is masked from 'package:ggpubr':
##
## gghistogram
Summary
# Clear the global env variables
# NOTE(review): rm(list = ls()) in a script is discouraged — it clears the
# global environment but not attached packages or options; prefer restarting
# the R session for a clean state.
rm(list=ls())
# Set seed so code can be reproduced
set.seed(2021)
# Read the train dataset and create desired data types
# NOTE(review): absolute, machine-specific path — not portable; consider a
# relative path or here::here().
train <- read_csv("C:/Users/chris/OneDrive/Documents/Projects/Project_5_Forecasting/walmart_sales/train_dataset.csv")
##
## -- Column specification --------------------------------------------------------
## cols(
## Store = col_double(),
## Dept = col_double(),
## Date = col_date(format = ""),
## Weekly_Sales = col_double(),
## IsHoliday = col_logical()
## )
# View the first 10 observations
head(train, n = 10)
## # A tibble: 10 x 5
## Store Dept Date Weekly_Sales IsHoliday
## <dbl> <dbl> <date> <dbl> <lgl>
## 1 1 1 2010-02-05 24924. FALSE
## 2 1 1 2010-02-12 46039. TRUE
## 3 1 1 2010-02-19 41596. FALSE
## 4 1 1 2010-02-26 19404. FALSE
## 5 1 1 2010-03-05 21828. FALSE
## 6 1 1 2010-03-12 21043. FALSE
## 7 1 1 2010-03-19 22137. FALSE
## 8 1 1 2010-03-26 26229. FALSE
## 9 1 1 2010-04-02 57258. FALSE
## 10 1 1 2010-04-09 42961. FALSE
# Take a look at the structure of the train data set
# skim(train)
Summary:
Walmart provided us with 4 data sets. This is the train data set, which has 5 columns (4 predictors plus the response, Weekly_Sales). The data spans from 2010-02-05 to 2012-11-01. Some information about each predictor:
# Read the stores data set and create desired data types
# (45 stores; one row per store with its Type and Size)
stores <- read_csv("C:/Users/chris/OneDrive/Documents/Projects/Project_5_Forecasting/walmart_sales/stores_dataset.csv")
##
## -- Column specification --------------------------------------------------------
## cols(
## Store = col_double(),
## Type = col_character(),
## Size = col_double()
## )
# View the first 10 observations
head(stores, n = 10)
## # A tibble: 10 x 3
## Store Type Size
## <dbl> <chr> <dbl>
## 1 1 A 151315
## 2 2 A 202307
## 3 3 B 37392
## 4 4 A 205863
## 5 5 B 34875
## 6 6 A 202505
## 7 7 B 70713
## 8 8 A 155078
## 9 9 B 125833
## 10 10 B 126512
# Take a look at the structure of the stores data set
# skim(stores)
Summary:
In the stores data set, there are 3 predictors. This data set contains information on the Size and Type columns of about 45 stores. Some information about the 3 predictors:
# Read the features data set and create desired data types
# (per-store, per-week covariates: temperature, fuel price, markdowns, CPI,
# unemployment; the MarkDown columns contain many NAs — see head() below)
features <- read_csv("C:/Users/chris/OneDrive/Documents/Projects/Project_5_Forecasting/walmart_sales/features_dataset.csv")
##
## -- Column specification --------------------------------------------------------
## cols(
## Store = col_double(),
## Date = col_date(format = ""),
## Temperature = col_double(),
## Fuel_Price = col_double(),
## MarkDown1 = col_double(),
## MarkDown2 = col_double(),
## MarkDown3 = col_double(),
## MarkDown4 = col_double(),
## MarkDown5 = col_double(),
## CPI = col_double(),
## Unemployment = col_double(),
## IsHoliday = col_logical()
## )
# View the first 10 observations
head(features, n = 10)
## # A tibble: 10 x 12
## Store Date Temperature Fuel_Price MarkDown1 MarkDown2 MarkDown3
## <dbl> <date> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 1 2010-02-05 42.3 2.57 NA NA NA
## 2 1 2010-02-12 38.5 2.55 NA NA NA
## 3 1 2010-02-19 39.9 2.51 NA NA NA
## 4 1 2010-02-26 46.6 2.56 NA NA NA
## 5 1 2010-03-05 46.5 2.62 NA NA NA
## 6 1 2010-03-12 57.8 2.67 NA NA NA
## 7 1 2010-03-19 54.6 2.72 NA NA NA
## 8 1 2010-03-26 51.4 2.73 NA NA NA
## 9 1 2010-04-02 62.3 2.72 NA NA NA
## 10 1 2010-04-09 65.9 2.77 NA NA NA
## # ... with 5 more variables: MarkDown4 <dbl>, MarkDown5 <dbl>, CPI <dbl>,
## # Unemployment <dbl>, IsHoliday <lgl>
# Take a look at the structure of the features data set
# skim(features)
Summary:
In this data set, there are 12 predictors. Some new information on some of the predictors that aren’t found in the stores & train data sets:
# Read the test data set and create desired data types
# (same keys as train but without the Weekly_Sales response)
test <- read_csv("C:/Users/chris/OneDrive/Documents/Projects/Project_5_Forecasting/walmart_sales/test_dataset.csv")
##
## -- Column specification --------------------------------------------------------
## cols(
## Store = col_double(),
## Dept = col_double(),
## Date = col_date(format = ""),
## IsHoliday = col_logical()
## )
# View the first 10 observations
head(test, n = 10)
## # A tibble: 10 x 4
## Store Dept Date IsHoliday
## <dbl> <dbl> <date> <lgl>
## 1 1 1 2012-11-02 FALSE
## 2 1 1 2012-11-09 FALSE
## 3 1 1 2012-11-16 FALSE
## 4 1 1 2012-11-23 TRUE
## 5 1 1 2012-11-30 FALSE
## 6 1 1 2012-12-07 FALSE
## 7 1 1 2012-12-14 FALSE
## 8 1 1 2012-12-21 FALSE
## 9 1 1 2012-12-28 TRUE
## 10 1 1 2013-01-04 FALSE
# Take a look at the structure of the test data set
# skim(test)
Summary:
This is the test data set, which has 4 predictors. Within the data set, you will find the data spans from 2012-11-02 to 2013-07-26. The test data set is pretty similar to the train data set, except it doesn't have the response variable - the Weekly_Sales.
# Merge train & features data set and saved into newly created df
# NOTE(review): merge() with no explicit `by` joins on all shared column
# names (here Store, Date, IsHoliday); confirm this is intended — an explicit
# by = c("Store", "Date") would be safer and self-documenting.
df <- merge(train, features)
# Merge stores data set with the df variable
train_df <- merge(df, stores, by = "Store")
# View the first 10 observations
head(train_df, n = 10)
## Store Date IsHoliday Dept Weekly_Sales Temperature Fuel_Price
## 1 1 2010-02-05 FALSE 1 24924.50 42.31 2.572
## 2 1 2010-02-05 FALSE 26 11737.12 42.31 2.572
## 3 1 2010-02-05 FALSE 17 13223.76 42.31 2.572
## 4 1 2010-02-05 FALSE 45 37.44 42.31 2.572
## 5 1 2010-02-05 FALSE 28 1085.29 42.31 2.572
## 6 1 2010-02-05 FALSE 79 46729.77 42.31 2.572
## 7 1 2010-02-05 FALSE 55 21249.31 42.31 2.572
## 8 1 2010-02-05 FALSE 5 32229.38 42.31 2.572
## 9 1 2010-02-05 FALSE 58 7659.97 42.31 2.572
## 10 1 2010-02-05 FALSE 7 21084.08 42.31 2.572
## MarkDown1 MarkDown2 MarkDown3 MarkDown4 MarkDown5 CPI Unemployment Type
## 1 NA NA NA NA NA 211.0964 8.106 A
## 2 NA NA NA NA NA 211.0964 8.106 A
## 3 NA NA NA NA NA 211.0964 8.106 A
## 4 NA NA NA NA NA 211.0964 8.106 A
## 5 NA NA NA NA NA 211.0964 8.106 A
## 6 NA NA NA NA NA 211.0964 8.106 A
## 7 NA NA NA NA NA 211.0964 8.106 A
## 8 NA NA NA NA NA 211.0964 8.106 A
## 9 NA NA NA NA NA 211.0964 8.106 A
## 10 NA NA NA NA NA 211.0964 8.106 A
## Size
## 1 151315
## 2 151315
## 3 151315
## 4 151315
## 5 151315
## 6 151315
## 7 151315
## 8 151315
## 9 151315
## 10 151315
Summary:
# Split Date into Year, Month, Week, Day columns.
# FIX: inside mutate() columns are referenced directly; the original
# `train_df$Date` was redundant and is a known footgun — it bypasses the
# pipeline's data mask and silently breaks if the piped data is grouped,
# filtered, or renamed before this step.
train_df <- train_df %>%
  dplyr::mutate(
    Year  = lubridate::year(Date),
    Month = lubridate::month(Date),
    Week  = lubridate::week(Date),
    Day   = lubridate::day(Date)
  )
head(train_df)
## Store Date IsHoliday Dept Weekly_Sales Temperature Fuel_Price MarkDown1
## 1 1 2010-02-05 FALSE 1 24924.50 42.31 2.572 NA
## 2 1 2010-02-05 FALSE 26 11737.12 42.31 2.572 NA
## 3 1 2010-02-05 FALSE 17 13223.76 42.31 2.572 NA
## 4 1 2010-02-05 FALSE 45 37.44 42.31 2.572 NA
## 5 1 2010-02-05 FALSE 28 1085.29 42.31 2.572 NA
## 6 1 2010-02-05 FALSE 79 46729.77 42.31 2.572 NA
## MarkDown2 MarkDown3 MarkDown4 MarkDown5 CPI Unemployment Type Size
## 1 NA NA NA NA 211.0964 8.106 A 151315
## 2 NA NA NA NA 211.0964 8.106 A 151315
## 3 NA NA NA NA 211.0964 8.106 A 151315
## 4 NA NA NA NA 211.0964 8.106 A 151315
## 5 NA NA NA NA 211.0964 8.106 A 151315
## 6 NA NA NA NA 211.0964 8.106 A 151315
## Year Month Week Day
## 1 2010 2 6 5
## 2 2010 2 6 5
## 3 2010 2 6 5
## 4 2010 2 6 5
## 5 2010 2 6 5
## 6 2010 2 6 5
Summary:
# Build a unique store/department ID (e.g. "1_26") and place it as the
# first column of the data frame.
unique_dept_store <- function(data) {
  data %>%
    mutate(Store_Dept = paste0(Store, "_", Dept), .before = 1)
}
# Apply the helper to both data sets - train & test
train_forecast <- unique_dept_store(train_df)
test_forecast <- unique_dept_store(test)
head(train_forecast)
## Store_Dept Store Date IsHoliday Dept Weekly_Sales Temperature
## 1 1_1 1 2010-02-05 FALSE 1 24924.50 42.31
## 2 1_26 1 2010-02-05 FALSE 26 11737.12 42.31
## 3 1_17 1 2010-02-05 FALSE 17 13223.76 42.31
## 4 1_45 1 2010-02-05 FALSE 45 37.44 42.31
## 5 1_28 1 2010-02-05 FALSE 28 1085.29 42.31
## 6 1_79 1 2010-02-05 FALSE 79 46729.77 42.31
## Fuel_Price MarkDown1 MarkDown2 MarkDown3 MarkDown4 MarkDown5 CPI
## 1 2.572 NA NA NA NA NA 211.0964
## 2 2.572 NA NA NA NA NA 211.0964
## 3 2.572 NA NA NA NA NA 211.0964
## 4 2.572 NA NA NA NA NA 211.0964
## 5 2.572 NA NA NA NA NA 211.0964
## 6 2.572 NA NA NA NA NA 211.0964
## Unemployment Type Size Year Month Week Day
## 1 8.106 A 151315 2010 2 6 5
## 2 8.106 A 151315 2010 2 6 5
## 3 8.106 A 151315 2010 2 6 5
## 4 8.106 A 151315 2010 2 6 5
## 5 8.106 A 151315 2010 2 6 5
## 6 8.106 A 151315 2010 2 6 5
head(test_forecast)
## # A tibble: 6 x 5
## Store_Dept Store Dept Date IsHoliday
## <chr> <dbl> <dbl> <date> <lgl>
## 1 1_1 1 1 2012-11-02 FALSE
## 2 1_1 1 1 2012-11-09 FALSE
## 3 1_1 1 1 2012-11-16 FALSE
## 4 1_1 1 1 2012-11-23 TRUE
## 5 1_1 1 1 2012-11-30 FALSE
## 6 1_1 1 1 2012-12-07 FALSE
Summary:
# Re-order the dataset to move the response variable - Weekly_Sales - to the
# last column (keys/dates first, covariates in the middle, response last).
col_df <- c("Date", "Year","Month", "Week", "Day", "Store_Dept", "Dept", "Store", "Type", "Size", "IsHoliday", "Temperature", "Fuel_Price", "CPI", "Unemployment", "MarkDown1", "MarkDown2", "MarkDown3", "MarkDown4", "MarkDown5", "Weekly_Sales")
# Assign the order of the columns into the train_df
# (indexing with a name vector errors if any name is missing — acts as a
# schema check)
train_forecast <- train_forecast[, col_df]
head(train_forecast)
## Date Year Month Week Day Store_Dept Dept Store Type Size IsHoliday
## 1 2010-02-05 2010 2 6 5 1_1 1 1 A 151315 FALSE
## 2 2010-02-05 2010 2 6 5 1_26 26 1 A 151315 FALSE
## 3 2010-02-05 2010 2 6 5 1_17 17 1 A 151315 FALSE
## 4 2010-02-05 2010 2 6 5 1_45 45 1 A 151315 FALSE
## 5 2010-02-05 2010 2 6 5 1_28 28 1 A 151315 FALSE
## 6 2010-02-05 2010 2 6 5 1_79 79 1 A 151315 FALSE
## Temperature Fuel_Price CPI Unemployment MarkDown1 MarkDown2 MarkDown3
## 1 42.31 2.572 211.0964 8.106 NA NA NA
## 2 42.31 2.572 211.0964 8.106 NA NA NA
## 3 42.31 2.572 211.0964 8.106 NA NA NA
## 4 42.31 2.572 211.0964 8.106 NA NA NA
## 5 42.31 2.572 211.0964 8.106 NA NA NA
## 6 42.31 2.572 211.0964 8.106 NA NA NA
## MarkDown4 MarkDown5 Weekly_Sales
## 1 NA NA 24924.50
## 2 NA NA 11737.12
## 3 NA NA 13223.76
## 4 NA NA 37.44
## 5 NA NA 1085.29
## 6 NA NA 46729.77
# Display the %s of missing values for each variable in a plot
gg_miss_var(train_forecast, show_pct = TRUE)
## Warning: It is deprecated to specify `guide = FALSE` to remove a guide. Please
## use `guide = "none"` instead.
Summary:
Let’s perform further analysis into the data set to decide whether or not we should utilize data imputation methods to estimate the missing values.
# Create a new df for visualization only
forecasting_vis <- train_df
# Take the average weekly sales per Dept/Year and arrange from largest to smallest
d1 <- forecasting_vis %>%
group_by(Dept, Year) %>%
summarise(avg_weeklysales = mean(Weekly_Sales)) %>%
arrange(desc(avg_weeklysales))
## `summarise()` has grouped output by 'Dept'. You can override using the `.groups` argument.
d1
## # A tibble: 243 x 3
## # Groups: Dept [81]
## Dept Year avg_weeklysales
## <dbl> <dbl> <dbl>
## 1 92 2012 78361.
## 2 92 2011 75417.
## 3 92 2010 72147.
## 4 95 2012 71262.
## 5 95 2010 69379.
## 6 95 2011 69047.
## 7 38 2012 62533.
## 8 38 2011 61223.
## 9 38 2010 59655.
## 10 72 2010 54707.
## # ... with 233 more rows
# Create a bar plot for Dept vs Weekly Sales
# NOTE(review): color = "red" inside aes() maps a constant string (adds a
# legend entry named "red") rather than coloring the bars red; if red bars
# were intended, move color/fill = "red" outside aes(). The same pattern
# recurs in later plots.
b1 <- ggplot(data = d1, aes(x = Dept,y = avg_weeklysales, color = "red")) +
geom_col() +
facet_wrap(~Year) +
labs(x = "Department", y = "Average Weekly Sales") +
ggtitle("Relationship of Dept vs Weekly Sales")
# Create a scatter plot for Dept vs Weekly Sales
s1 <- ggplot(d1, aes(x=Dept, y=avg_weeklysales)) + geom_point() +
scale_colour_hue(l=50) + # Use a slightly darker palette than normal
geom_smooth(method=lm, # Add linear regression lines
se=TRUE, # With a shaded confidence region
fullrange=TRUE) + # Extend regression lines
labs(x = "Department", y = "Average Weekly Sales")
# Organize the plots on one page
grid.arrange(b1, s1, nrow=2)
## `geom_smooth()` using formula 'y ~ x'
cor(forecasting_vis$Dept,forecasting_vis$Weekly_Sales)
## [1] 0.1480321
Summary:
bar plot: department #92 has the highest average weekly sales for 2010 - 2012, at around 70k, vs department #47 in 2010 and 2011 with the lowest (negative) average sales
scatter plot: this plot showcases the linearity of the x-variable Dept vs the y-variable Avg Weekly Sales; the fitted line is nearly flat, which indicates a minimal relationship
# Take the average weekly sales by dept and arrange from desc to asce
# For 2010 Only
# NOTE(review): Year is numeric; Year == "2010" works via coercion, but
# Year == 2010 would be clearer.
top_dept_2010 <- forecasting_vis %>%
group_by(Dept) %>%
filter(Year == "2010") %>%
summarise(Top_Avg_Wklysales = mean(Weekly_Sales)) %>%
# rename(Top_Avg_Wklysales = Freq) %>%
arrange(desc(Top_Avg_Wklysales))
top_dept_2010
## # A tibble: 81 x 2
## Dept Top_Avg_Wklysales
## <dbl> <dbl>
## 1 92 72147.
## 2 95 69379.
## 3 38 59655.
## 4 72 54707.
## 5 65 49096.
## 6 40 44637.
## 7 2 43543.
## 8 90 43243.
## 9 94 33717.
## 10 91 33067.
## # ... with 71 more rows
# Create a plot to display top ten highest weekly sales by dept - 2010
p_2010 <- head(top_dept_2010, n = 10) %>%
ggplot(aes(x = reorder(as.factor(Dept),
Top_Avg_Wklysales),
y = Top_Avg_Wklysales,
fill=as.factor(Dept))) +
geom_bar(stat = 'identity') +
theme(legend.position = "none")+
labs(y = "Total Weekly Sales", x = 'Departments', title = "Top Ten Departments with the Highest Average Weekly Sales - 2010") +
coord_flip()
# For 2011 Only — same computation as 2010 above, filtered to 2011
top_dept_2011 <- forecasting_vis %>%
group_by(Dept) %>%
filter(Year == "2011") %>%
summarise(Top_Avg_Wklysales = mean(Weekly_Sales)) %>%
# rename(Top_Avg_Wklysales = Freq) %>%
arrange(desc(Top_Avg_Wklysales))
top_dept_2011
## # A tibble: 81 x 2
## Dept Top_Avg_Wklysales
## <dbl> <dbl>
## 1 92 75417.
## 2 95 69047.
## 3 38 61223.
## 4 72 52184.
## 5 90 46132.
## 6 40 44155.
## 7 2 43378.
## 8 65 41301.
## 9 94 33959.
## 10 91 33704.
## # ... with 71 more rows
# Create a plot to display top ten highest weekly sales by dept - 2011
p_2011 <- head(top_dept_2011, n = 10) %>%
ggplot(aes(x = reorder(as.factor(Dept),
Top_Avg_Wklysales),
y = Top_Avg_Wklysales,
fill=as.factor(Dept))) +
geom_bar(stat = 'identity') +
theme(legend.position = "none")+
labs(y = "Total Weekly Sales", x = 'Departments', title = "Top Ten Departments with the Highest Average Weekly Sales - 2011") +
coord_flip()
# For 2012 Only — same computation as 2010/2011 above, filtered to 2012
top_dept_2012 <- forecasting_vis %>%
  group_by(Dept) %>%
  filter(Year == "2012") %>%
  summarise(Top_Avg_Wklysales = mean(Weekly_Sales)) %>%
  arrange(desc(Top_Avg_Wklysales))
top_dept_2012
## # A tibble: 81 x 2
## Dept Top_Avg_Wklysales
## <dbl> <dbl>
## 1 92 78361.
## 2 95 71262.
## 3 38 62533.
## 4 65 46370.
## 5 90 46365.
## 6 40 46098.
## 7 72 44028.
## 8 2 43955.
## 9 91 34361.
## 10 94 32461.
## # ... with 71 more rows
# Create a plot to display top ten highest weekly sales by dept - 2012
# BUG FIX: the comment and plot title previously said "2011" for this 2012
# plot (copy-paste error).
p_2012 <- head(top_dept_2012, n = 10) %>%
  ggplot(aes(x = reorder(as.factor(Dept), Top_Avg_Wklysales),
             y = Top_Avg_Wklysales,
             fill = as.factor(Dept))) +
  geom_bar(stat = 'identity') +
  theme(legend.position = "none") +
  labs(y = "Total Weekly Sales", x = 'Departments', title = "Top Ten Departments with the Highest Average Weekly Sales - 2012") +
  coord_flip()
# Combine the three yearly plots into one interactive figure
fig <- subplot(p_2010, p_2011, p_2012) %>%
  layout(title = list(text = "Top Ten Departments with the Highest Average Weekly Sales in 2010 - 2012"),
         plot_bgcolor = '#e5ecf6',
         xaxis = list(
           zerolinecolor = '#ffff',
           zerolinewidth = 2,
           gridcolor = 'ffff'),
         yaxis = list(
           zerolinecolor = '#ffff',
           zerolinewidth = 2,
           gridcolor = 'ffff'))
fig
Summary:
the number one dept in the top ten rank for all three years is #92; the bottom one in the top ten rank is #91 in 2010-2011 and #94 in 2012, so #91 moved up one rank in 2012
surprisingly, no departments fell off the top ten rank across these three years, and no new departments entered it
# Take the average weekly sales per Temperature/Year and arrange from largest to smallest
d1 <- forecasting_vis %>%
group_by(Temperature, Year) %>%
summarise(avg_weeklysales = mean(Weekly_Sales)) %>%
arrange(desc(avg_weeklysales))
## `summarise()` has grouped output by 'Temperature'. You can override using the `.groups` argument.
# View(d1)
# Create a line plot for Temperature vs Weekly Sales
b2 <- ggplot(data = d1, aes(x = Temperature,y = avg_weeklysales, color = "red")) +
geom_line(color = "black") +
geom_smooth(method = "loess", color = "red", span = 1/5) +
ggtitle("Relationship of Temperature vs Weekly Sales") +
facet_wrap(~Year)
# Create a scatter plot for Temperature vs Weekly Sales
s2 <- ggplot(d1, aes(x = Temperature, y = avg_weeklysales)) + geom_point() +
scale_colour_hue(l=50) + # Use a slightly darker palette than normal
geom_smooth(method = lm, # Add linear regression lines
se = TRUE, # With a shaded confidence region
fullrange = TRUE) # Extend regression lines
# Organize the plots on one page
grid.arrange(b2, s2, nrow = 2)
## `geom_smooth()` using formula 'y ~ x'
## `geom_smooth()` using formula 'y ~ x'
cor(forecasting_vis$Temperature,forecasting_vis$Weekly_Sales)
## [1] -0.002312447
Summary:
line plot: the temperature at 25.17 in 2010 has the highest average weekly sales: 51k, vs the temperature at 37.74 in 2011 with the lowest average weekly sales: 4k
scatter plot: shows the relationship of temperature vs average weekly sales; the fitted line is flat, which indicates there's barely any relationship, confirmed by a correlation of -0.0023
# Take the average weekly sales per Fuel_Price/Year and arrange from largest to smallest
d1 <- forecasting_vis %>%
  group_by(Fuel_Price, Year) %>%
  summarise(avg_weeklysales = mean(Weekly_Sales)) %>%
  arrange(desc(avg_weeklysales))
## `summarise()` has grouped output by 'Fuel_Price'. You can override using the `.groups` argument.
# view(d1)
# Create a line plot for Fuel_Price vs Weekly Sales
# BUG FIX: facet_wrap(~Year) was previously on its own line without a
# preceding "+", so it was never added to the plot — the console printed the
# bare FacetWrap ggproto object instead. It is now chained onto b3, matching
# the equivalent Temperature plot above.
b3 <- ggplot(data = d1, aes(x = Fuel_Price, y = avg_weeklysales, color = "red")) +
  geom_line(color = "black") +
  geom_smooth(method = "loess", color = "red", span = 1/5) +
  ggtitle("Relationship of Fuel_Price vs Weekly Sales") +
  facet_wrap(~Year)
# Create a scatter plot for Fuel_Price vs Weekly Sales
s3 <- ggplot(d1, aes(x = Fuel_Price, y = avg_weeklysales)) +
  geom_point() +
  scale_colour_hue(l = 50) +      # Use a slightly darker palette than normal
  geom_smooth(method = lm,        # Add a linear regression line
              se = TRUE,          # With a shaded confidence region
              fullrange = TRUE)   # Extend the regression line
# Organize the plots on one page
grid.arrange(b3, s3, nrow = 2)
## `geom_smooth()` using formula 'y ~ x'
## `geom_smooth()` using formula 'y ~ x'
cor(forecasting_vis$Fuel_Price, forecasting_vis$Weekly_Sales)
## [1] -0.0001202955
Summary:
line plot: the fuel price at $3.10 in 2011 has the highest average weekly sales: 36k, vs the fuel price at $3.61 in 2012 with the lowest average weekly sales: 5k
scatter plot: shows the relationship of fuel price vs average weekly sales; the fitted line is flat, which indicates there's barely any relationship, confirmed by a correlation of -0.00012
# Bar plots of average weekly sales against each anonymized MarkDown column.
# NOTE(review): the five pipelines below are identical except for the
# MarkDown column; they could be generated by a helper taking the column
# name (tidy-eval {{ }} or .data[[var]]) to avoid the repetition.
# Create a bar plot for MarkDown1 vs Weekly Sales
b4 <- forecasting_vis %>%
group_by(MarkDown1, Year) %>%
summarise(avg_weeklysales = mean(Weekly_Sales)) %>%
arrange(desc(avg_weeklysales)) %>%
ggplot(aes(x = MarkDown1,y = avg_weeklysales, color = "red")) +
geom_col() +
facet_wrap(~Year) +
ggtitle("MarkDown1 vs Weekly Sales")
## `summarise()` has grouped output by 'MarkDown1'. You can override using the `.groups` argument.
# Create a bar plot for MarkDown2 vs Weekly Sales
b5 <- forecasting_vis %>%
group_by(MarkDown2, Year) %>%
summarise(avg_weeklysales = mean(Weekly_Sales)) %>%
arrange(desc(avg_weeklysales)) %>%
ggplot(aes(x = MarkDown2,y = avg_weeklysales, color = "red")) +
geom_col() +
facet_wrap(~Year) +
ggtitle("MarkDown2 vs Weekly Sales")
## `summarise()` has grouped output by 'MarkDown2'. You can override using the `.groups` argument.
# Create a bar plot for MarkDown3 vs Weekly Sales
b6 <- forecasting_vis %>%
group_by(MarkDown3, Year) %>%
summarise(avg_weeklysales = mean(Weekly_Sales)) %>%
arrange(desc(avg_weeklysales)) %>%
ggplot(aes(x = MarkDown3,y = avg_weeklysales, color = "red")) +
geom_col() +
facet_wrap(~Year) +
ggtitle("MarkDown3 vs Weekly Sales")
## `summarise()` has grouped output by 'MarkDown3'. You can override using the `.groups` argument.
# Create a bar plot for MarkDown4 vs Weekly Sales
b7 <- forecasting_vis %>%
group_by(MarkDown4, Year) %>%
summarise(avg_weeklysales = mean(Weekly_Sales)) %>%
arrange(desc(avg_weeklysales)) %>%
ggplot(aes(x = MarkDown4,y = avg_weeklysales, color = "red")) +
geom_col() +
facet_wrap(~Year) +
ggtitle("MarkDown4 vs Weekly Sales")
## `summarise()` has grouped output by 'MarkDown4'. You can override using the `.groups` argument.
# Create a bar plot for MarkDown5 vs Weekly Sales
b8 <- forecasting_vis %>%
group_by(MarkDown5, Year) %>%
summarise(avg_weeklysales = mean(Weekly_Sales)) %>%
arrange(desc(avg_weeklysales)) %>%
ggplot(aes(x = MarkDown5,y = avg_weeklysales, color = "red")) +
geom_col() +
facet_wrap(~Year) +
ggtitle("MarkDown5 vs Weekly Sales")
## `summarise()` has grouped output by 'MarkDown5'. You can override using the `.groups` argument.
# Organize the plots across two pages (warnings below come from NA rows in
# the MarkDown columns being dropped by geom_col)
grid.arrange(b4, b5, b6, nrow=3)
## Warning: Removed 3 rows containing missing values (position_stack).
## Warning: Removed 3 rows containing missing values (position_stack).
## Warning: Removed 3 rows containing missing values (position_stack).
grid.arrange(b7, b8, nrow=2)
## Warning: Removed 3 rows containing missing values (position_stack).
## Warning: Removed 2 rows containing missing values (position_stack).
Summary:
since MarkDown1 to MarkDown5 contain anonymized information, there isn't much we can draw from these plots beyond knowing each column's lowest and highest range
All MarkDowns 1 to 5 have missing values in 2010
MarkDown1 ranges from 0.27 to 88k
MarkDown2 ranges from a negative 265.8 to 104k
MarkDown3 ranges from a negative 29.10 to 141k
MarkDown4 ranges from 0.22 to 67k
MarkDown5 ranges from 135.2 to 108k
# Take the average weekly sales per CPI/Year and arrange from largest to smallest
d1 <- forecasting_vis %>%
group_by(CPI, Year) %>%
summarise(avg_weeklysales = mean(Weekly_Sales)) %>%
arrange(desc(avg_weeklysales))
## `summarise()` has grouped output by 'CPI'. You can override using the `.groups` argument.
d1
## # A tibble: 2,145 x 3
## # Groups: CPI [2,145]
## CPI Year avg_weeklysales
## <dbl> <dbl> <dbl>
## 1 183. 2010 39579.
## 2 213. 2010 37883.
## 3 221. 2011 36731.
## 4 205. 2010 36158.
## 5 189. 2011 34444.
## 6 212. 2011 33652.
## 7 137. 2010 33268.
## 8 211. 2010 33166.
## 9 213. 2010 32392.
## 10 219. 2011 30678.
## # ... with 2,135 more rows
# Create a bar plot for CPI vs Weekly Sales
b9 <- ggplot(data = d1, aes(x = CPI,y = avg_weeklysales, color = "red")) +
geom_col() +
ggtitle("Relationship of CPI vs Weekly Sales") +
facet_wrap(~Year)
# Create a scatter plot for CPI vs Weekly Sales
s9 <- ggplot(d1, aes(x=CPI, y=avg_weeklysales)) + geom_point() +
scale_colour_hue(l=50) + # Use a slightly darker palette than normal
geom_smooth(method=lm, # Add linear regression lines
se=TRUE, # With a shaded confidence region
fullrange=TRUE) # Extend regression lines
# Organize the plots on one page
grid.arrange(b9, s9, nrow=2)
## `geom_smooth()` using formula 'y ~ x'
cor(forecasting_vis$CPI,forecasting_vis$Weekly_Sales)
## [1] -0.02092134
Summary:
bar plot: CPI 182.55~ in 2010 has the highest average weekly sales: 39k compared to CPI 132.11~ in 2010 has the lowest average weekly sales: 15k
scatter plot: this plot showcases the linearity of x-variable CPI vs y-variable Avg Weekly Sales, we can see the linear line is more straight, which indicate it has a bare minimal relationship and it is confirmed through a correlation of -0.021
# Take the average weekly sales per Unemployment/Year and arrange from largest to smallest
d1 <- forecasting_vis %>%
group_by(Unemployment, Year) %>%
summarise(avg_weeklysales = mean(Weekly_Sales)) %>%
arrange(desc(avg_weeklysales))
## `summarise()` has grouped output by 'Unemployment'. You can override using the `.groups` argument.
d1
## # A tibble: 354 x 3
## # Groups: Unemployment [349]
## Unemployment Year avg_weeklysales
## <dbl> <dbl> <dbl>
## 1 5.14 2011 33559.
## 2 6.39 2011 31117.
## 3 7.13 2010 30828.
## 4 4.31 2012 30432.
## 5 3.88 2012 29929.
## 6 4.61 2012 29872.
## 7 4.08 2012 29634.
## 8 7.80 2010 29482.
## 9 5.64 2011 29092.
## 10 5.96 2012 28551.
## # ... with 344 more rows
# Create a bar plot for Unemployment vs Weekly Sales
b10 <- ggplot(data = d1, aes(x = Unemployment,y = avg_weeklysales, color = "red")) +
geom_col() +
ggtitle("Relationship of Unemployment vs Weekly Sales") +
facet_wrap(~Year)
# Create a scatter plot for Unemployment vs Weekly Sales
s10 <- ggplot(d1, aes(x=Unemployment, y=avg_weeklysales)) + geom_point() +
scale_colour_hue(l=50) + # Use a slightly darker palette than normal
geom_smooth(method=lm, # Add linear regression lines
se=TRUE, # With a shaded confidence region
fullrange=TRUE) # Extend regression lines
# Organize the plots on one page
grid.arrange(b10, s10, nrow=2)
## `geom_smooth()` using formula 'y ~ x'
cor(forecasting_vis$Unemployment,forecasting_vis$Weekly_Sales)
## [1] -0.02586372
Summary:
bar plot: unemployment at 5.14% in 2011 has the highest average weekly sales: 33k compared to unemployment at 6.57~ in 2010 has the lowest average weekly sales: 4k
scatter plot: this plot showcases the linearity of x-variable unemployment vs y-variable Avg Weekly Sales, we can see the linear line is more straight, which indicate it has a bare minimal relationship and it is confirmed through a correlation of -0.025
# Take the average weekly sales per store Size/Year and arrange from largest to smallest
d1 <- forecasting_vis %>%
group_by(Size, Year) %>%
summarise(avg_weeklysales = mean(Weekly_Sales)) %>%
arrange(desc(avg_weeklysales))
## `summarise()` has grouped output by 'Size'. You can override using the `.groups` argument.
d1
## # A tibble: 120 x 3
## # Groups: Size [40]
## Size Year avg_weeklysales
## <dbl> <dbl> <dbl>
## 1 200898 2010 31257.
## 2 205863 2012 29975.
## 3 205863 2011 29831.
## 4 203742 2010 29790.
## 5 203742 2011 29463.
## 6 203742 2012 29250.
## 7 200898 2011 29115.
## 8 202307 2010 27794.
## 9 205863 2010 27709.
## 10 219622 2012 27631.
## # ... with 110 more rows
# Create a bar plot for Size vs Weekly Sales
b11 <- ggplot(data = d1, aes(x = Size,y = avg_weeklysales, color = "red")) +
geom_col() +
ggtitle("Relationship of Size vs Weekly Sales") +
facet_wrap(~Year)
# Create a scatter plot for Size vs Weekly Sales
s11 <- ggplot(d1, aes(x=Size, y=avg_weeklysales)) + geom_point() +
scale_colour_hue(l=50) + # Use a slightly darker palette than normal
geom_smooth(method=lm, # Add linear regression lines
se=TRUE, # With a shaded confidence region
fullrange=TRUE) # Extend regression lines
# Organize the plots on one page
grid.arrange(b11, s11, nrow=2)
## `geom_smooth()` using formula 'y ~ x'
cor(forecasting_vis$Size,forecasting_vis$Weekly_Sales)
## [1] 0.243828
Summary:
bar plot: the store size at 200k in 2010 has the highest average weekly sales: 31k compared to store size at 34k in 2010 has the lowest average weekly sales: 4k
scatter plot: this plot showcases the linearity of x-variable store size vs y-variable Avg Weekly Sales, we can see the linear line is going upward, which indicate it has a positive linearity relationship and it is confirmed through a correlation of +0.24
# Calculate the average of weekly sales per store for 2010
# NOTE(review): this 2010 version groups only by Store, while the 2011 and
# 2012 versions below group by (Year, Store) — the numbers agree because of
# the Year filter, but the result shapes differ; consider making all three
# consistent.
avg_sales_2010 <- forecasting_vis %>%
group_by(Store) %>%
filter(Year == "2010") %>%
summarise(avg_weeklysales = mean(Weekly_Sales)) %>%
arrange(desc(avg_weeklysales)) %>%
mutate(across(where(is.numeric), ~ round(., 2))) # round to 2 dec. places
avg_sales_2010
## # A tibble: 45 x 2
## Store avg_weeklysales
## <dbl> <dbl>
## 1 14 31257.
## 2 20 29790.
## 3 2 27794.
## 4 4 27709.
## 5 10 26984.
## 6 13 26982.
## 7 27 26320.
## 8 6 22555.
## 9 1 21283.
## 10 19 21253.
## # ... with 35 more rows
# Create a plot to display top ten highest weekly sales by stores - 2010
s_2010 <- head(avg_sales_2010, n = 10) %>%
ggplot(aes(x = reorder(as.factor(Store),
avg_weeklysales),
y = avg_weeklysales,
fill=as.factor(Store))) +
geom_bar(stat = 'identity') +
theme(legend.position = "none")+
labs(y = "Total Average Sales", x = 'Stores', title = "Top Ten Stores with the Highest Weekly Sales - 2010") +
coord_flip()
# Calculate the average of weekly sales per store for 2011
avg_sales_2011 <- forecasting_vis %>%
group_by(Year, Store) %>%
filter(Year == "2011") %>%
summarise(avg_weeklysales = mean(Weekly_Sales)) %>%
arrange(desc(avg_weeklysales)) %>%
mutate(across(where(is.numeric), ~ round(., 2))) # round to 2 dec. places
## `summarise()` has grouped output by 'Year'. You can override using the `.groups` argument.
avg_sales_2011
## # A tibble: 45 x 3
## # Groups: Year [1]
## Year Store avg_weeklysales
## <dbl> <dbl> <dbl>
## 1 2011 4 29831.
## 2 2011 20 29463.
## 3 2011 14 29115.
## 4 2011 13 27474.
## 5 2011 2 26444.
## 6 2011 10 26399.
## 7 2011 27 24657.
## 8 2011 1 21718.
## 9 2011 6 21607.
## 10 2011 39 21102.
## # ... with 35 more rows
# Create a plot to display top ten highest weekly sales by stores - 2011
s_2011 <- head(avg_sales_2011, n = 10) %>%
ggplot(aes(x = reorder(as.factor(Store),
avg_weeklysales),
y = avg_weeklysales,
fill=as.factor(Store))) +
geom_bar(stat = 'identity') +
theme(legend.position = "none")+
labs(y = "Total Average Sales", x = 'Stores', title = "Top Ten Stores with the Highest Weekly Sales - 2011") +
coord_flip()
# Calculate the average of weekly sales per store for 2012
# (original comments said "2010"/"2011" — copy-paste slips; this is 2012)
avg_sales_2012 <- forecasting_vis %>%
group_by(Year, Store) %>%
filter(Year == "2012") %>%
summarise(avg_weeklysales = mean(Weekly_Sales)) %>%
arrange(desc(avg_weeklysales)) %>%
mutate(across(where(is.numeric), ~ round(., 2))) # round to 2 dec. places
## `summarise()` has grouped output by 'Year'. You can override using the `.groups` argument.
avg_sales_2012
## # A tibble: 45 x 3
## # Groups: Year [1]
## Year Store avg_weeklysales
## <dbl> <dbl> <dbl>
## 1 2012 4 29975.
## 2 2012 20 29250.
## 3 2012 13 27631.
## 4 2012 2 26451.
## 5 2012 14 25626.
## 6 2012 10 25507.
## 7 2012 27 23373.
## 8 2012 39 22229.
## 9 2012 1 22180.
## 10 2012 6 21573.
## # ... with 35 more rows
# Create a plot to display top ten highest weekly sales by stores - 2012
s_2012 <- head(avg_sales_2012, n = 10) %>%
ggplot(aes(x = reorder(as.factor(Store),
avg_weeklysales),
y = avg_weeklysales,
fill=as.factor(Store))) +
geom_bar(stat = 'identity') +
theme(legend.position = "none")+
labs(y = "Total Average Weekly Sales", x = 'Stores', title = "Top Ten Stores with the Highest Average Weekly Sales - 2012") +
coord_flip()
# Combine the three yearly plots into one interactive figure
fig <- subplot(s_2010, s_2011, s_2012)%>%
layout(title = list(text = "Top Ten Stores with the Highest Average Weekly Sales in 2010 - 2012"),
plot_bgcolor='#e5ecf6',
xaxis = list(
zerolinecolor = '#ffff',
zerolinewidth = 2,
gridcolor = 'ffff'),
yaxis = list(
zerolinecolor = '#ffff',
zerolinewidth = 2,
gridcolor = 'ffff'))
fig
Summary:
the number one store in the top ten rank is #14 in 2010 and #4 in 2011-2012, while the bottom one in the top ten rank is #19 in 2010, #39 in 2011, and #6 in 2012
surprisingly, store #19 dropped out of the top ten rank in 2011-2012, while store #39 climbed into the top ten rank
# Group and calc. the percentage of stores in each of types A to C
stores_pct <- stores %>%
  group_by(Type) %>%
  summarize(counts = n(),
            percentage = n()/nrow(stores)) %>%
  mutate(across(where(is.numeric), ~ round(., 2))) # round to 2 dec. places
stores_pct
## # A tibble: 3 x 3
## Type counts percentage
## <chr> <dbl> <dbl>
## 1 A 22 0.49
## 2 B 17 0.38
## 3 C 6 0.13
# Create a pie chart with plotly
# BUG FIX: the original passed `colors = colors`, but no `colors` variable
# was ever defined — it resolved to the base function grDevices::colors, so
# no palette was applied. Use an explicit palette vector instead.
pie_palette <- c("#66c2a5", "#fc8d62", "#8da0cb")
store_pie <- plot_ly(data = stores_pct, labels = ~Type, values = ~percentage,
                     type = 'pie',
                     sort = FALSE,
                     marker = list(colors = pie_palette,
                                   line = list(color = "black", width = 1))) %>%
  layout(title = "Types of Stores Proportion ")
store_pie
Summary:
# Group and calc. the percentage of the number of proportion in types A to C
isholiday_pct <- train %>%
group_by(IsHoliday) %>%
summarize(counts = n())
isholiday_pct
## # A tibble: 2 x 2
## IsHoliday counts
## <lgl> <int>
## 1 FALSE 391909
## 2 TRUE 29661
# Create a pie chart with plotly
isholiday_pie <- plot_ly(data = isholiday_pct, labels = ~IsHoliday, values = ~counts,
type = 'pie',
sort= FALSE,
marker= list(colors = colors,
line = list(color="black", width = 1))) %>%
layout(title = "Proportion of Holiday vs Non-Holiday")
isholiday_pie
# Create a bar graph of Yearly & Monthly Total Weekly Sales for 2010 to 2012
df1 <- forecasting_vis %>%
mutate(month = factor(Month)) %>%
mutate(year = factor(Year)) %>%
mutate(wkly_sales = Weekly_Sales)
p1 <- ggplot(df1, aes(x = month, y = wkly_sales, fill = year)) +
geom_col(position = "dodge") +
labs(x = "Month", y = "Total Weekly Sales", title = "Total Monthly Weekly Sales in 2010 - 2012")
p1
Summary:
# Keep only numeric columns and drop the MarkDown1-5 columns before computing the correlation matrix
corr_matrix <- forecasting_vis %>%
dplyr:: select(where(is.numeric)) %>%
dplyr::select(-c("MarkDown1", "MarkDown2", "MarkDown3", "MarkDown4", "MarkDown5"))
# Create a correlation matrix
corr <- round(cor(corr_matrix), 2)
corrplot(corr, method = "color", cl.pos = 'n', rect.col = "black", tl.col = "indianred4", addCoef.col = "black", number.digits = 2, number.cex = 0.60, tl.cex = 0.7, cl.cex = 1, col = colorRampPalette(c("green4","white","red"))(100))
Summary:
# Create a new train df for modeling
# train_forecast <- train_df
#
# # Create a new test df for modeling
# test_forecast <- test_df
# Check to see how many unique observations there are in train data set & test data set
length(unique(train_forecast$Store_Dept))
## [1] 3331
length(unique(test_forecast$Store_Dept))
## [1] 3169
# Filter out the 11 observations that are in the test data set
# train_df <- filter(train_df, storeDept %in% unique(test_df$storeDept))
train_obs <- filter(train_forecast, Store_Dept %in% unique(test_forecast$Store_Dept))
# Compare the counts of unique Store_Dept in the test vs train data sets (negative means train has more)
# length(unique(test_forecast$Store_Dept)) - length(unique(train_forecast$Store_Dept))
length(unique(test_forecast$Store_Dept)) - length(unique(train_forecast$Store_Dept))
## [1] -162
# Find the 11 numbers of Dept_Store that are not found in the train data set
eleven_dept_store_nums <-
test_forecast %>%
filter(!Store_Dept %in% unique(train_obs$Store_Dept)) %>%
.$Store_Dept %>%
unique()
eleven_dept_store_nums
## [1] "5_99" "9_99" "10_99" "18_43" "24_43" "25_99" "34_39" "36_30" "37_29"
## [10] "42_30" "45_39"
Summary:
# Check if the data has irregular time series (missing gaps between observations)
# Add 1 because the first week is not accounted for in the difference
# Check to see the beginning & ending of dates in the train data set
begin_train <- min(train_forecast$Date)
end_train <- max(train_forecast$Date)
# Check to see the beginning & ending of dates in the test data set
begin_test <- min(test_forecast$Date)
end_test <- max(test_forecast$Date)
# Calculate the #s of weeks between the ending & beginning dates for train & test data sets
nums_wks_train <- difftime(end_train, begin_train, units = "weeks") + 1
nums_wks_train
## Time difference of 143 weeks
nums_wks_test <- difftime(end_test, begin_test, units = "weeks") + 1
nums_wks_test
## Time difference of 39 weeks
# Retrieve numbers of observations in each store_dept in the train data set
nums_obs_dept_store <-
train_forecast %>%
count(Store_Dept) %>%
arrange(n) %>%
rename(Num_Obs = n)
unique(nums_obs_dept_store$Num_Obs)
## [1] 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18
## [19] 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36
## [37] 37 38 39 40 41 42 43 44 45 46 48 49 50 51 52 53 54 56
## [55] 57 58 59 60 61 62 64 65 66 67 68 69 70 71 74 75 76 77
## [73] 78 79 81 82 83 84 85 86 87 88 89 90 91 92 93 95 96 97
## [91] 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115
## [109] 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133
## [127] 134 135 136 137 138 139 140 141 142 143
# Identify the dept_store that has 143 obs
numObs_vs_weeklySales <- train_forecast %>%
merge(nums_obs_dept_store, by = "Store_Dept") %>%
dplyr::select(Date, Store_Dept, Weekly_Sales, Num_Obs)
Summary:
# Filter only holiday weeks
holiday_wks <-
train_forecast %>%
filter(IsHoliday == TRUE) %>%
.$Date %>%
unique()
holiday_wks
## [1] "2010-02-12" "2010-09-10" "2010-11-26" "2010-12-31" "2011-02-11"
## [6] "2011-09-09" "2011-11-25" "2011-12-30" "2012-02-10" "2012-09-07"
# Calculate the weeks before the identified holiday weeks
before_holiday_wks <- holiday_wks - 7
before_holiday_wks
## [1] "2010-02-05" "2010-09-03" "2010-11-19" "2010-12-24" "2011-02-04"
## [6] "2011-09-02" "2011-11-18" "2011-12-23" "2012-02-03" "2012-08-31"
# Convert irregular ts to regular ts
# Create a tibble and then add in the missing gaps of the ts through outer joining with the dates of the 143 wks
train_dates <- tibble("Date" = seq(begin_train, end_train, by = 7))
# Convert one Store_Dept's irregular weekly series into a regular one.
# Outer-joins the data against the full weekly date grid (`train_dates`,
# a global tibble spanning begin_train..end_train) so every week is
# present, then back-fills the constant identifier columns.
# Weekly_Sales stays NA for the inserted weeks (imputed later).
# NOTE(review): assumes Store_Dept/Store/Dept are each constant within
# `data` (one value per group) -- holds when called per Store_Dept group.
convert_regular_ts <- function(data){
  store_dept <- unique(data$Store_Dept)
  store <- unique(data$Store)
  dept <- unique(data$Dept)
  # all = TRUE keeps every date from the grid (was `T`, which is
  # reassignable and unsafe -- always spell out TRUE/FALSE)
  merge(data, train_dates, by = "Date", all = TRUE) %>%
    replace_na(list(Store_Dept = store_dept,
                    Store = store,
                    Dept = dept))
}
store_dept_df <-
train_forecast %>%
dplyr::select(Store_Dept, Store, Dept, Date, Weekly_Sales) %>%
group_by(Store_Dept) %>%
do(convert_regular_ts(.)) %>%
ungroup() %>%
arrange(Store, Dept)
head(store_dept_df)
## # A tibble: 6 x 5
## Date Store_Dept Store Dept Weekly_Sales
## <date> <chr> <dbl> <dbl> <dbl>
## 1 2010-02-05 1_1 1 1 24924.
## 2 2010-02-12 1_1 1 1 46039.
## 3 2010-02-19 1_1 1 1 41596.
## 4 2010-02-26 1_1 1 1 19404.
## 5 2010-03-05 1_1 1 1 21828.
## 6 2010-03-12 1_1 1 1 21043.
Summary:
I also calculated the weeks before each holiday date then, I created a tibble to add in the missing gaps of the time series by joining the dates of the 143 weeks
# Convert into multiple time series object and use pivot_wider to spread the mts into separate columns
store_dept_mts<-
store_dept_df %>%
dplyr::select(-Store, -Dept) %>%
pivot_wider(names_from = Store_Dept, values_from = Weekly_Sales) %>%
dplyr::select(-Date) %>%
ts(start = decimal_date(begin_train), frequency = 52)
store_dept_mts[, 1]
## Time Series:
## Start = 2010.09589041096
## End = 2012.82665964173
## Frequency = 52
## [1] 24924.50 46039.49 41595.55 19403.54 21827.90 21043.39 22136.64 26229.21
## [9] 57258.43 42960.91 17596.96 16145.35 16555.11 17413.94 18926.74 14773.04
## [17] 15580.43 17558.09 16637.62 16216.27 16328.72 16333.14 17688.76 17150.84
## [25] 15360.45 15381.82 17508.41 15536.40 15740.13 15793.87 16241.78 18194.74
## [33] 19354.23 18122.52 20094.19 23388.03 26978.34 25543.04 38640.93 34238.88
## [41] 19549.39 19552.84 18820.29 22517.56 31497.65 44912.86 55931.23 19124.58
## [49] 15984.24 17359.70 17341.47 18461.18 21665.76 37887.17 46845.87 19363.83
## [57] 20327.61 21280.40 20334.23 20881.10 20398.09 23873.79 28762.37 50510.31
## [65] 41512.39 20138.19 17235.15 15136.78 15741.60 16434.15 15883.52 14978.09
## [73] 15682.81 15363.50 16148.87 15654.85 15766.60 15922.41 15295.55 14539.79
## [81] 14689.24 14537.37 15277.27 17746.68 18535.48 17859.30 18337.68 20797.58
## [89] 23077.55 23351.80 31579.90 39886.06 18689.54 19050.66 20911.25 25293.49
## [97] 33305.92 45773.03 46788.75 23350.88 16567.69 16894.40 18365.10 18378.16
## [105] 23510.49 36988.49 54060.10 20124.22 20113.03 21140.07 22366.88 22107.70
## [113] 28952.86 57592.12 34684.21 16976.19 16347.60 17147.44 18164.20 18517.79
## [121] 16963.55 16065.49 17666.00 17558.82 16633.41 15722.82 17823.37 16566.18
## [129] 16348.06 15731.18 16628.31 16119.92 17330.70 16286.40 16680.24 18322.37
## [137] 19616.22 19251.50 18947.81 21904.47 22764.01 24185.27 27390.81
Summary:
# Perform interpolation to fill in missing values
# Perform interpolation to fill in missing values in a single series.
# Strategy depends on how many non-missing observations the series has:
#   >= 3 observed -> seasonal decomposition imputation (na_seadec)
#   == 2 observed -> linear interpolation (na_interpolation)
#   <= 1 observed -> last observation carried forward (na_locf)
# BUG FIX: the original tested sum(!is.na(ts)) -- but `ts` is the base R
# *function*, not the argument -- so the count was always wrong and every
# series silently fell through to na_locf(). Count NAs in `current_ts`.
impute <- function(current_ts){
  n_obs <- sum(!is.na(current_ts))
  if(n_obs >= 3){
    na_seadec(current_ts)
  } else if(n_obs == 2){
    na_interpolation(current_ts)
  } else{
    na_locf(current_ts)
  }
}
for(i in 1:ncol(store_dept_mts)){
store_dept_mts[, i] <- impute(store_dept_mts[, i])
}
sum(is.na(store_dept_mts))
## [1] 0
Summary:
# Determine which are holiday vs non-holiday
holiday_non_holiday <- train_forecast %>%
dplyr::select(Date, IsHoliday) %>%
unique() %>%
.$IsHoliday
# Count the #s of rows in the mts train
total_data <- nrow(store_dept_mts)
# Split the data into 80% to train & 20% to validate
train_set <- round(0.80 * total_data)
validate_set <- total_data - train_set
validate_weights <- holiday_non_holiday[(total_data - validate_set + 1):total_data]
train_data <- store_dept_mts %>% subset(end = train_set)
validate_data <- store_dept_mts %>% subset(start = train_set + 1)
Summary:
NOTE: the test data set will be untouched until I arrive at a final model based on the lowest WMAE score
# Define a function to calculate the WMAE
# Weighted Mean Absolute Error across all Store_Dept forecasts.
# Flattens the forecast matrix and the validation matrix column-wise
# and weights each week by the holiday indicator.
# Depends on globals: `validate_data`, `validate_weights`.
WMAE <- function(fc){
  actual <- as.vector(validate_data)
  predicted <- as.vector(fc)
  # replicate the per-week weights once per Store_Dept column
  wk_weights <- as.vector(rep(validate_weights, ncol(fc)))
  MetricsWeighted::mae(actual, predicted, wk_weights)
}
Summary:
NOTE: This is also the metric that Walmart specifically points out in the guideline to use.
# Define a function to generates forecast for each time series forecasting method
# Generate h-step-ahead forecasts for every Store_Dept column of the
# multivariate ts `train_data` using the supplied forecasting function
# `model(current_ts, h, ...)`; extra arguments are forwarded via `...`.
# Returns an h x ncol(train_data) matrix of point forecasts.
# tic()/toc() (tictoc package) print elapsed wall time.
model_forecasts <- function(train_data, h, model, ...){
  tic()
  # Preallocate the forecast matrix (one column per Store_Dept)
  full_forecast <- matrix(0, h, ncol(train_data))
  # seq_len() instead of 1:ncol() -- safe even for zero columns
  for(i in seq_len(ncol(train_data))){
    current_ts <- train_data[, i]
    full_forecast[, i] <- model(current_ts, h, ...)
  }
  toc()
  # Return forecasts
  full_forecast
}
Summary:
# change index for different storeDept
basic_ts <- store_dept_mts[, 111]
basic_train_mod <- basic_ts %>% subset(end = 107)
# Create a plot function to autoplot each time series forecasting method
# Overlay one or more forecasts on top of the reference series.
# ref         : the observed ts to autoplot as the base layer
# fc_list     : list of forecast objects to overlay
# model_names : character vector of legend labels, parallel to fc_list
forecast_plots <- function(ref, fc_list, model_names){
  plt <- autoplot(ref)
  # seq_along() instead of 1:length() -- safe for an empty fc_list
  for(i in seq_along(fc_list)){
    # PI = FALSE (was the unsafe shorthand `F`): point forecasts only,
    # no prediction-interval ribbons
    plt <- plt + autolayer(fc_list[[i]], series = model_names[i], PI = FALSE)
  }
  plt <- plt +
    ylab("Weekly_Sales") +
    guides(color = guide_legend(title = "Time Series Forecasting Method:"))
  plt
}
Summary:
# Using the forecast function to generate weekly sales projection
# Simple exponential smoothing: fit with ses() and return only the
# h point forecasts (drops the prediction intervals).
simple_es <- function(current_ts, h){
  fc <- ses(current_ts, h = h)
  fc$mean
}
# Produce the summary of the simple ES
simple_es_mod <- ses(basic_train_mod, h = 36)
summary(simple_es_mod)
##
## Forecast method: Simple exponential smoothing
##
## Model Information:
## Simple exponential smoothing
##
## Call:
## ses(y = basic_train_mod, h = 36)
##
## Smoothing parameters:
## alpha = 0.5233
##
## Initial states:
## l = 3722.5512
##
## sigma: 757.2014
##
## AIC AICc BIC
## 1922.714 1922.947 1930.733
##
## Error measures:
## ME RMSE MAE MPE MAPE MASE ACF1
## Training set 13.64275 750.0913 559.8444 -2.596425 15.21913 0.8879636 0.08546371
##
## Forecasts:
## Point Forecast Lo 80 Hi 80 Lo 95 Hi 95
## 2012.154 4486.434 3516.041 5456.827 3002.34662 5970.521
## 2012.173 4486.434 3391.209 5581.659 2811.43202 6161.436
## 2012.192 4486.434 3279.217 5693.651 2640.15452 6332.714
## 2012.211 4486.434 3176.766 5796.102 2483.47015 6489.398
## 2012.231 4486.434 3081.768 5891.100 2338.18343 6634.685
## 2012.250 4486.434 2992.800 5980.068 2202.11861 6770.749
## 2012.269 4486.434 2908.842 6064.026 2073.71498 6899.153
## 2012.288 4486.434 2829.131 6143.737 1951.80792 7021.060
## 2012.307 4486.434 2753.082 6219.786 1835.50102 7137.367
## 2012.327 4486.434 2680.232 6292.636 1724.08682 7248.781
## 2012.346 4486.434 2610.209 6362.659 1616.99534 7355.873
## 2012.365 4486.434 2542.707 6430.162 1513.75936 7459.109
## 2012.384 4486.434 2477.471 6495.397 1413.99021 7558.878
## 2012.404 4486.434 2414.288 6558.580 1317.36046 7655.508
## 2012.423 4486.434 2352.976 6619.892 1223.59116 7749.277
## 2012.442 4486.434 2293.377 6679.491 1132.44240 7840.426
## 2012.461 4486.434 2235.355 6737.513 1043.70602 7929.162
## 2012.481 4486.434 2178.792 6794.076 957.20005 8015.668
## 2012.500 4486.434 2123.582 6849.286 872.76432 8100.104
## 2012.519 4486.434 2069.634 6903.234 790.25694 8182.611
## 2012.538 4486.434 2016.863 6956.005 709.55154 8263.317
## 2012.557 4486.434 1965.197 7007.671 630.53496 8342.333
## 2012.577 4486.434 1914.569 7058.299 553.10542 8419.763
## 2012.596 4486.434 1864.918 7107.950 477.17097 8495.697
## 2012.615 4486.434 1816.190 7156.678 402.64822 8570.220
## 2012.634 4486.434 1768.336 7204.532 329.46123 8643.407
## 2012.654 4486.434 1721.309 7251.559 257.54066 8715.327
## 2012.673 4486.434 1675.069 7297.799 186.82295 8786.045
## 2012.692 4486.434 1629.578 7343.290 117.24970 8855.618
## 2012.711 4486.434 1584.800 7388.068 48.76707 8924.101
## 2012.731 4486.434 1540.702 7432.166 -18.67466 8991.543
## 2012.750 4486.434 1497.254 7475.614 -85.12157 9057.990
## 2012.769 4486.434 1454.430 7518.438 -150.61643 9123.484
## 2012.788 4486.434 1412.201 7560.667 -215.19901 9188.067
## 2012.807 4486.434 1370.545 7602.323 -278.90641 9251.774
## 2012.827 4486.434 1329.439 7643.429 -341.77328 9314.641
# Make prediction on the validate data for simple ES
simple_es_pred <- model_forecasts(train_data, validate_set, simple_es)
## 15.34 sec elapsed
# Calculate the WMAE on the simple ES validate data
WMAE(simple_es_pred)
## [1] 3114.852
# Create a plot to display simple ES
forecast_plots(basic_ts,
list(
simple_es_mod
),
c(
"Simple ES"
)
)
Summary:
# Using the forecast function to generate weekly sales projection
# Holt's linear trend method: fit with holt() and return only the
# h point forecasts (drops the prediction intervals).
holt_trend <- function(current_ts, h){
  fc <- holt(current_ts, h = h)
  fc$mean
}
# Produce the summary of the Holt's Trend
# NOTE(review): holt() has no `seasonal` argument (that belongs to hw());
# it is swallowed by `...` and has no effect -- the printed summary below
# confirms a plain "Holt's method" fit. Confirm and drop the argument.
holt_trend_mod <- holt(basic_train_mod, h = 36, seasonal = "multiplicative")
summary(holt_trend_mod)
##
## Forecast method: Holt's method
##
## Model Information:
## Holt's method
##
## Call:
## holt(y = basic_train_mod, h = 36, seasonal = "multiplicative")
##
## Smoothing parameters:
## alpha = 0.5228
## beta = 1e-04
##
## Initial states:
## l = 3664.0851
## b = 7.5552
##
## sigma: 764.4764
##
## AIC AICc BIC
## 1926.703 1927.297 1940.067
##
## Error measures:
## ME RMSE MAE MPE MAPE MASE ACF1
## Training set 0.4167589 750.051 560.0883 -2.949682 15.26321 0.8883504 0.08485551
##
## Forecasts:
## Point Forecast Lo 80 Hi 80 Lo 95 Hi 95
## 2012.154 4500.210 3520.494 5479.926 3001.863749 5998.556
## 2012.173 4507.770 3402.215 5613.325 2816.969365 6198.570
## 2012.192 4515.329 3296.822 5733.837 2651.783078 6378.876
## 2012.211 4522.889 3201.008 5844.770 2501.246478 6544.531
## 2012.231 4530.449 3112.676 5948.221 2362.152740 6698.744
## 2012.250 4538.008 3030.398 6045.619 2232.317350 6843.699
## 2012.269 4545.568 2953.149 6137.987 2110.172841 6980.963
## 2012.288 4553.128 2880.163 6226.092 1994.549296 7111.706
## 2012.307 4560.687 2810.853 6310.522 1884.546071 7236.828
## 2012.327 4568.247 2744.752 6391.742 1779.452287 7357.041
## 2012.346 4575.807 2681.487 6470.126 1678.695171 7472.918
## 2012.365 4583.366 2620.751 6545.981 1581.805192 7584.927
## 2012.384 4590.926 2562.288 6619.564 1488.391744 7693.460
## 2012.404 4598.485 2505.883 6691.088 1398.125744 7798.845
## 2012.423 4606.045 2451.352 6760.738 1310.726866 7901.363
## 2012.442 4613.605 2398.539 6828.671 1225.953991 8001.255
## 2012.461 4621.164 2347.306 6895.023 1143.597939 8098.731
## 2012.481 4628.724 2297.534 6959.915 1063.475841 8193.972
## 2012.500 4636.284 2249.117 7023.451 985.426731 8287.141
## 2012.519 4643.843 2201.962 7085.725 909.308039 8378.379
## 2012.538 4651.403 2155.986 7146.819 834.992778 8467.813
## 2012.557 4658.963 2111.116 7206.809 762.367262 8555.558
## 2012.577 4666.522 2067.283 7265.761 691.329228 8641.715
## 2012.596 4674.082 2024.428 7323.736 621.786300 8726.378
## 2012.615 4681.642 1982.496 7380.787 553.654693 8809.628
## 2012.634 4689.201 1941.437 7436.966 486.858141 8891.544
## 2012.654 4696.761 1901.205 7492.317 421.326983 8972.195
## 2012.673 4704.321 1861.759 7546.882 356.997388 9051.644
## 2012.692 4711.880 1823.060 7600.701 293.810696 9129.950
## 2012.711 4719.440 1785.073 7653.807 231.712851 9207.167
## 2012.731 4726.999 1747.765 7706.234 170.653908 9283.345
## 2012.750 4734.559 1711.107 7758.012 110.587612 9358.531
## 2012.769 4742.119 1675.069 7809.168 51.471024 9432.767
## 2012.788 4749.678 1639.626 7859.731 -6.735802 9506.093
## 2012.807 4757.238 1604.754 7909.722 -64.070098 9578.546
## 2012.827 4764.798 1570.430 7959.166 -120.566641 9650.162
# Make prediction on the validate data for Holt's Trend
holt_trend_pred <- model_forecasts(train_data, validate_set, holt_trend)
## 24.92 sec elapsed
# Calculate the WMAE on the Holt's Trend validate data
WMAE(holt_trend_pred)
## [1] 4913.859
# Create a plot to display Holt's Trend
forecast_plots(basic_ts,
list(
holt_trend_mod
),
c(
"Holt's Trend"
)
)
Summary:
# Using the forecast function to generate weekly sales projection
# Seasonal naive: repeat the last observed seasonal cycle, returning
# only the h point forecasts (drops the prediction intervals).
seasonal_snaive <- function(current_ts, h){
  fc <- snaive(current_ts, h = h)
  fc$mean
}
# Produce the summary of the seasonal naive
snaive_mod <- snaive(basic_train_mod, 36)
summary(snaive_mod)
##
## Forecast method: Seasonal naive method
##
## Model Information:
## Call: snaive(y = basic_train_mod, h = 36)
##
## Residual sd: 770.7283
##
## Error measures:
## ME RMSE MAE MPE MAPE MASE ACF1
## Training set 153.2638 770.7283 630.4813 3.367486 15.9803 1 0.227437
##
## Forecasts:
## Point Forecast Lo 80 Hi 80 Lo 95 Hi 95
## 2012.154 3868.82 2881.092 4856.548 2358.22 5379.42
## 2012.173 4425.76 3438.032 5413.488 2915.16 5936.36
## 2012.192 4054.91 3067.182 5042.638 2544.31 5565.51
## 2012.211 5383.70 4395.972 6371.428 3873.10 6894.30
## 2012.231 3946.91 2959.182 4934.638 2436.31 5457.51
## 2012.250 4092.91 3105.182 5080.638 2582.31 5603.51
## 2012.269 4625.38 3637.652 5613.108 3114.78 6135.98
## 2012.288 4371.79 3384.062 5359.518 2861.19 5882.39
## 2012.307 4984.82 3997.092 5972.548 3474.22 6495.42
## 2012.327 4780.88 3793.152 5768.608 3270.28 6291.48
## 2012.346 4549.64 3561.912 5537.368 3039.04 6060.24
## 2012.365 4703.73 3716.002 5691.458 3193.13 6214.33
## 2012.384 3854.91 2867.182 4842.638 2344.31 5365.51
## 2012.404 3515.91 2528.182 4503.638 2005.31 5026.51
## 2012.423 4412.82 3425.092 5400.548 2902.22 5923.42
## 2012.442 4293.91 3306.182 5281.638 2783.31 5804.51
## 2012.461 4675.58 3687.852 5663.308 3164.98 6186.18
## 2012.481 3332.88 2345.152 4320.608 1822.28 4843.48
## 2012.500 4494.38 3506.652 5482.108 2983.78 6004.98
## 2012.519 4041.10 3053.372 5028.828 2530.50 5551.70
## 2012.538 3082.50 2094.772 4070.228 1571.90 4593.10
## 2012.557 3656.93 2669.202 4644.658 2146.33 5167.53
## 2012.577 3327.76 2340.032 4315.488 1817.16 4838.36
## 2012.596 3684.17 2696.442 4671.898 2173.57 5194.77
## 2012.615 4019.59 3031.862 5007.318 2508.99 5530.19
## 2012.634 3368.93 2381.202 4356.658 1858.33 4879.53
## 2012.654 4324.61 3336.882 5312.338 2814.01 5835.21
## 2012.673 4007.97 3020.242 4995.698 2497.37 5518.57
## 2012.692 3289.04 2301.312 4276.768 1778.44 4799.64
## 2012.711 3355.03 2367.302 4342.758 1844.43 4865.63
## 2012.731 3391.13 2403.402 4378.858 1880.53 4901.73
## 2012.750 3294.28 2306.552 4282.008 1783.68 4804.88
## 2012.769 3958.71 2970.982 4946.438 2448.11 5469.31
## 2012.788 3185.88 2198.152 4173.608 1675.28 4696.48
## 2012.807 3797.81 2810.082 4785.538 2287.21 5308.41
## 2012.827 3611.89 2624.162 4599.618 2101.29 5122.49
# Make prediction on the validate data for SNAIVE
snaive_pred <- model_forecasts(train_data, validate_set, seasonal_snaive)
## 6.67 sec elapsed
# Calculate the WMAE on the SNAIVE validate data
WMAE(snaive_pred)
## [1] 1603.568
# Create a plot to display snaive
forecast_plots(basic_ts,
list(
snaive_mod
),
c(
"Seasonal Naive"
)
)
Summary:
# Using the forecast function to generate weekly sales projection
# Linear model on time-series components (linear trend + seasonal
# dummies); returns only the h point forecasts.
tslm_forecast <- function(current_ts, h){
  fit <- tslm(current_ts ~ trend + season)
  fc <- forecast(fit, h = h)
  fc$mean
}
# Produce the summary of the tslm
tslm_mod <- tslm(basic_train_mod ~ trend + season) %>% forecast(h = 36)
summary(tslm_mod)
##
## Forecast method: Linear regression model
##
## Model Information:
##
## Call:
## tslm(formula = basic_train_mod ~ trend + season)
##
## Coefficients:
## (Intercept) trend season2 season3 season4 season5
## 2837.911 3.730 -489.425 -620.535 -310.324 176.666
## season6 season7 season8 season9 season10 season11
## 800.054 802.874 1378.051 767.610 1719.346 1288.191
## season12 season13 season14 season15 season16 season17
## 1825.356 808.731 1627.001 1602.007 1434.732 1637.267
## season18 season19 season20 season21 season22 season23
## 1685.567 2069.967 1758.033 1035.893 1261.663 1862.888
## season24 season25 season26 season27 season28 season29
## 1568.204 1526.809 814.729 1388.249 851.379 373.350
## season30 season31 season32 season33 season34 season35
## 593.835 386.770 676.995 484.976 358.416 637.526
## season36 season37 season38 season39 season40 season41
## 296.476 -348.719 121.297 -178.633 6.712 524.697
## season42 season43 season44 season45 season46 season47
## -46.447 51.788 364.598 1105.963 485.848 1002.449
## season48 season49 season50 season51 season52
## 1067.219 951.529 1283.869 1942.170 3136.860
##
##
## Error measures:
## ME RMSE MAE MPE MAPE MASE
## Training set -3.82731e-14 387.511 306.1423 -1.102105 8.192261 0.4855692
## ACF1
## Training set 0.2318816
##
## Forecasts:
## Point Forecast Lo 80 Hi 80 Lo 95 Hi 95
## 2012.154 4008.338 3120.502 4896.173 2636.389 5380.287
## 2012.173 4963.803 4075.967 5851.638 3591.854 6335.752
## 2012.192 4536.378 3648.542 5424.213 3164.429 5908.327
## 2012.211 5077.273 4189.437 5965.108 3705.324 6449.222
## 2012.231 4064.378 3176.542 4952.213 2692.429 5436.327
## 2012.250 4886.378 3998.542 5774.213 3514.429 6258.327
## 2012.269 4865.113 3977.277 5752.948 3493.164 6237.062
## 2012.288 4701.568 3813.732 5589.403 3329.619 6073.517
## 2012.307 4907.833 4019.997 5795.668 3535.884 6279.782
## 2012.327 4959.863 4072.027 5847.698 3587.914 6331.812
## 2012.346 5347.993 4460.157 6235.828 3976.044 6719.942
## 2012.365 5039.788 4151.952 5927.623 3667.839 6411.737
## 2012.384 4321.378 3433.542 5209.213 2949.429 5693.327
## 2012.404 4550.878 3663.042 5438.713 3178.929 5922.827
## 2012.423 5155.833 4267.997 6043.668 3783.884 6527.782
## 2012.442 4864.878 3977.042 5752.713 3492.929 6236.827
## 2012.461 4827.213 3939.377 5715.048 3455.264 6199.162
## 2012.481 4118.863 3231.027 5006.698 2746.914 5490.812
## 2012.500 4696.113 3808.277 5583.948 3324.164 6068.062
## 2012.519 4162.973 3275.137 5050.808 2791.024 5534.922
## 2012.538 3688.673 2800.837 4576.508 2316.724 5060.622
## 2012.557 3912.888 3025.052 4800.723 2540.939 5284.837
## 2012.577 3709.553 2821.717 4597.388 2337.604 5081.502
## 2012.596 4003.508 3115.672 4891.343 2631.559 5375.457
## 2012.615 3815.218 2927.382 4703.053 2443.269 5187.167
## 2012.634 3692.388 2804.552 4580.223 2320.439 5064.337
## 2012.654 3975.228 3087.392 4863.063 2603.279 5347.177
## 2012.673 3637.908 2750.072 4525.743 2265.959 5009.857
## 2012.692 2996.443 2108.607 3884.278 1624.494 4368.392
## 2012.711 3470.188 2582.352 4358.023 2098.239 4842.137
## 2012.731 3173.988 2286.152 4061.823 1802.039 4545.937
## 2012.750 3363.063 2475.227 4250.898 1991.114 4735.012
## 2012.769 3884.778 2996.942 4772.613 2512.829 5256.727
## 2012.788 3317.363 2429.527 4205.198 1945.414 4689.312
## 2012.807 3419.328 2531.492 4307.163 2047.379 4791.277
## 2012.827 3735.868 2848.032 4623.703 2363.919 5107.817
# Make prediction on the validate data for tslm
tslm_pred <- model_forecasts(train_data, validate_set, tslm_forecast)
## 36.84 sec elapsed
# Calculate the WMAE on the tslm validate data
WMAE(tslm_pred)
## [1] 1482.701
# Create a plot to display linear model with ts components
forecast_plots(basic_ts,
list(
tslm_mod
),
c(
"TSLM Forecast"
)
)
Summary:
# Using the forecast function to generate weekly sales projection
# Return the h point forecasts from the generic forecast() method.
# NOTE(review): despite the name, this calls forecast::forecast(), not
# tbats(); for this series the generic selected an STL + ETS(A,N,N)
# model (see the summary output below). Rename the function or call
# tbats() explicitly if a TBATS model was intended.
tbats_fc <- function(current_ts, h){
  forecast::forecast(current_ts, h = h)$mean
}
# Produce the summary of the TBATS
tbats_mod <- forecast::forecast(basic_train_mod, h = 36)
summary(tbats_mod)
##
## Forecast method: STL + ETS(A,N,N)
##
## Model Information:
## ETS(A,N,N)
##
## Call:
## ets(y = na.interp(x), model = etsmodel, allow.multiplicative.trend = allow.multiplicative.trend)
##
## Smoothing parameters:
## alpha = 0.2068
##
## Initial states:
## l = 3663.313
##
## sigma: 388.7651
##
## AIC AICc BIC
## 1780.050 1780.283 1788.069
##
## Error measures:
## ME RMSE MAE MPE MAPE MASE
## Training set 26.69818 385.1146 305.4423 -0.2866351 8.043471 0.4844589
## ACF1
## Training set 0.06771511
##
## Forecasts:
## Point Forecast Lo 80 Hi 80 Lo 95 Hi 95
## 2012.154 4127.258 3629.036 4625.481 3365.293 4889.224
## 2012.173 5075.838 4567.073 5584.603 4297.750 5853.927
## 2012.192 4647.011 4127.918 5166.104 3853.126 5440.895
## 2012.211 5195.314 4666.094 5724.534 4385.942 6004.686
## 2012.231 4175.228 3636.072 4714.385 3350.659 4999.797
## 2012.250 4987.000 4438.087 5535.913 4147.510 5826.491
## 2012.269 4969.558 4411.059 5528.058 4115.407 5823.710
## 2012.288 4802.081 4234.156 5370.005 3933.515 5670.646
## 2012.307 5010.285 4433.090 5587.480 4127.542 5893.029
## 2012.327 5056.271 4469.952 5642.590 4159.573 5952.968
## 2012.346 5433.978 4838.674 6029.281 4523.540 6344.416
## 2012.365 5128.382 4524.227 5732.536 4204.408 6052.356
## 2012.384 4405.441 3792.564 5018.318 3468.127 5342.756
## 2012.404 4625.133 4003.656 5246.611 3674.665 5575.601
## 2012.423 5230.646 4600.686 5860.607 4267.205 6194.088
## 2012.442 4938.805 4300.474 5577.136 3962.562 5915.048
## 2012.461 4903.234 4256.641 5549.827 3914.356 5892.113
## 2012.481 4184.284 3529.534 4839.035 3182.929 5185.639
## 2012.500 4765.415 4102.607 5428.223 3751.738 5779.093
## 2012.519 4230.080 3559.311 4900.848 3204.227 5255.932
## 2012.538 3746.787 3068.151 4425.423 2708.903 4784.671
## 2012.557 3972.063 3285.651 4658.476 2922.285 5021.842
## 2012.577 3764.055 3069.952 4458.157 2702.516 4825.593
## 2012.596 4055.604 3353.896 4757.313 2982.434 5128.775
## 2012.615 3870.466 3161.233 4579.698 2785.788 4955.143
## 2012.634 3738.119 3021.441 4454.796 2642.055 4834.183
## 2012.654 4025.748 3301.702 4749.794 2918.415 5133.081
## 2012.673 3685.360 2954.020 4416.701 2566.871 4803.849
## 2012.692 3039.877 2301.314 3778.440 1910.342 4169.411
## 2012.711 3505.624 2759.909 4251.339 2365.151 4646.097
## 2012.731 3210.342 2457.542 3963.141 2059.034 4361.649
## 2012.750 3392.886 2633.068 4152.704 2230.845 4554.928
## 2012.769 3913.235 3146.463 4680.007 2740.558 5085.912
## 2012.788 3340.260 2566.596 4113.923 2157.043 4523.476
## 2012.807 3445.283 2664.789 4225.777 2251.620 4638.946
## 2012.827 3752.684 2965.418 4539.950 2548.665 4956.703
# Make prediction on the validate data for TBATS
tbats_pred <- model_forecasts(train_data, validate_set, tbats_fc)
## 129.38 sec elapsed
# Calculate the WMAE on the TBATS validate data
WMAE(tbats_pred)
## [1] 1354.773
# Create a plot to display ES state space model with Box-Cox transformation, ARMA errors, Trend and Seasonal components
forecast_plots(basic_ts,
list(
tbats_mod
),
c(
"TBATS"
)
)
Summary:
# Using the forecast function to generate weekly sales projection
# STL decomposition with an ETS model on the seasonally adjusted
# series, optimizing MAE; returns only the h point forecasts.
stl_ets <- function(current_ts, h){
  fc <- stlf(current_ts, method = "ets", opt.crit = 'mae', h = h)
  fc$mean
}
# Produce the summary of the STL + ETS model
stl_ets_mod <- stlf(basic_train_mod, method = "ets", 36)
summary(stl_ets_mod)
##
## Forecast method: STL + ETS(A,N,N)
##
## Model Information:
## ETS(A,N,N)
##
## Call:
## ets(y = na.interp(x), model = etsmodel, allow.multiplicative.trend = allow.multiplicative.trend)
##
## Smoothing parameters:
## alpha = 0.2068
##
## Initial states:
## l = 3663.313
##
## sigma: 388.7651
##
## AIC AICc BIC
## 1780.050 1780.283 1788.069
##
## Error measures:
## ME RMSE MAE MPE MAPE MASE
## Training set 26.69818 385.1146 305.4423 -0.2866351 8.043471 0.4844589
## ACF1
## Training set 0.06771511
##
## Forecasts:
## Point Forecast Lo 80 Hi 80 Lo 95 Hi 95
## 2012.154 4127.258 3629.036 4625.481 3365.293 4889.224
## 2012.173 5075.838 4567.073 5584.603 4297.750 5853.927
## 2012.192 4647.011 4127.918 5166.104 3853.126 5440.895
## 2012.211 5195.314 4666.094 5724.534 4385.942 6004.686
## 2012.231 4175.228 3636.072 4714.385 3350.659 4999.797
## 2012.250 4987.000 4438.087 5535.913 4147.510 5826.491
## 2012.269 4969.558 4411.059 5528.058 4115.407 5823.710
## 2012.288 4802.081 4234.156 5370.005 3933.515 5670.646
## 2012.307 5010.285 4433.090 5587.480 4127.542 5893.029
## 2012.327 5056.271 4469.952 5642.590 4159.573 5952.968
## 2012.346 5433.978 4838.674 6029.281 4523.540 6344.416
## 2012.365 5128.382 4524.227 5732.536 4204.408 6052.356
## 2012.384 4405.441 3792.564 5018.318 3468.127 5342.756
## 2012.404 4625.133 4003.656 5246.611 3674.665 5575.601
## 2012.423 5230.646 4600.686 5860.607 4267.205 6194.088
## 2012.442 4938.805 4300.474 5577.136 3962.562 5915.048
## 2012.461 4903.234 4256.641 5549.827 3914.356 5892.113
## 2012.481 4184.284 3529.534 4839.035 3182.929 5185.639
## 2012.500 4765.415 4102.607 5428.223 3751.738 5779.093
## 2012.519 4230.080 3559.311 4900.848 3204.227 5255.932
## 2012.538 3746.787 3068.151 4425.423 2708.903 4784.671
## 2012.557 3972.063 3285.651 4658.476 2922.285 5021.842
## 2012.577 3764.055 3069.952 4458.157 2702.516 4825.593
## 2012.596 4055.604 3353.896 4757.313 2982.434 5128.775
## 2012.615 3870.466 3161.233 4579.698 2785.788 4955.143
## 2012.634 3738.119 3021.441 4454.796 2642.055 4834.183
## 2012.654 4025.748 3301.702 4749.794 2918.415 5133.081
## 2012.673 3685.360 2954.020 4416.701 2566.871 4803.849
## 2012.692 3039.877 2301.314 3778.440 1910.342 4169.411
## 2012.711 3505.624 2759.909 4251.339 2365.151 4646.097
## 2012.731 3210.342 2457.542 3963.141 2059.034 4361.649
## 2012.750 3392.886 2633.068 4152.704 2230.845 4554.928
## 2012.769 3913.235 3146.463 4680.007 2740.558 5085.912
## 2012.788 3340.260 2566.596 4113.923 2157.043 4523.476
## 2012.807 3445.283 2664.789 4225.777 2251.620 4638.946
## 2012.827 3752.684 2965.418 4539.950 2548.665 4956.703
# Make prediction on the validate data for STL + ETS
stl_ets_pred <- model_forecasts(train_data, validate_set, stl_ets)
## 146.01 sec elapsed
# Calculate the WMAE on the STL + ETS validate data
WMAE(stl_ets_pred)
## [1] 1362.378
# Create a plot to display the STL + ETS forecast
forecast_plots(basic_ts,
list(
stl_ets_mod
),
c(
"STL_ETS"
)
)
Summary:
# Using the forecast function to generate weekly sales projection
# STL decomposition with an ARIMA model on the seasonally adjusted
# series; returns only the h point forecasts.
stl_arima <- function(current_ts, h){
  fc <- stlf(current_ts, method = "arima", h = h)
  fc$mean
}
# Produce the summary of the STL + ARIMA model
# BUG FIX: the original fitted method = "ets" here (copy-paste from the
# STL+ETS section), so the printed summary duplicated the STL+ETS model
# even though this section -- and stl_arima_pred below -- are ARIMA.
# h is now passed by name for clarity.
stl_arima_mod <- stlf(basic_train_mod, method = "arima", h = 36)
summary(stl_arima_mod)
##
## Forecast method: STL + ETS(A,N,N)
##
## Model Information:
## ETS(A,N,N)
##
## Call:
## ets(y = na.interp(x), model = etsmodel, allow.multiplicative.trend = allow.multiplicative.trend)
##
## Smoothing parameters:
## alpha = 0.2068
##
## Initial states:
## l = 3663.313
##
## sigma: 388.7651
##
## AIC AICc BIC
## 1780.050 1780.283 1788.069
##
## Error measures:
## ME RMSE MAE MPE MAPE MASE
## Training set 26.69818 385.1146 305.4423 -0.2866351 8.043471 0.4844589
## ACF1
## Training set 0.06771511
##
## Forecasts:
## Point Forecast Lo 80 Hi 80 Lo 95 Hi 95
## 2012.154 4127.258 3629.036 4625.481 3365.293 4889.224
## 2012.173 5075.838 4567.073 5584.603 4297.750 5853.927
## 2012.192 4647.011 4127.918 5166.104 3853.126 5440.895
## 2012.211 5195.314 4666.094 5724.534 4385.942 6004.686
## 2012.231 4175.228 3636.072 4714.385 3350.659 4999.797
## 2012.250 4987.000 4438.087 5535.913 4147.510 5826.491
## 2012.269 4969.558 4411.059 5528.058 4115.407 5823.710
## 2012.288 4802.081 4234.156 5370.005 3933.515 5670.646
## 2012.307 5010.285 4433.090 5587.480 4127.542 5893.029
## 2012.327 5056.271 4469.952 5642.590 4159.573 5952.968
## 2012.346 5433.978 4838.674 6029.281 4523.540 6344.416
## 2012.365 5128.382 4524.227 5732.536 4204.408 6052.356
## 2012.384 4405.441 3792.564 5018.318 3468.127 5342.756
## 2012.404 4625.133 4003.656 5246.611 3674.665 5575.601
## 2012.423 5230.646 4600.686 5860.607 4267.205 6194.088
## 2012.442 4938.805 4300.474 5577.136 3962.562 5915.048
## 2012.461 4903.234 4256.641 5549.827 3914.356 5892.113
## 2012.481 4184.284 3529.534 4839.035 3182.929 5185.639
## 2012.500 4765.415 4102.607 5428.223 3751.738 5779.093
## 2012.519 4230.080 3559.311 4900.848 3204.227 5255.932
## 2012.538 3746.787 3068.151 4425.423 2708.903 4784.671
## 2012.557 3972.063 3285.651 4658.476 2922.285 5021.842
## 2012.577 3764.055 3069.952 4458.157 2702.516 4825.593
## 2012.596 4055.604 3353.896 4757.313 2982.434 5128.775
## 2012.615 3870.466 3161.233 4579.698 2785.788 4955.143
## 2012.634 3738.119 3021.441 4454.796 2642.055 4834.183
## 2012.654 4025.748 3301.702 4749.794 2918.415 5133.081
## 2012.673 3685.360 2954.020 4416.701 2566.871 4803.849
## 2012.692 3039.877 2301.314 3778.440 1910.342 4169.411
## 2012.711 3505.624 2759.909 4251.339 2365.151 4646.097
## 2012.731 3210.342 2457.542 3963.141 2059.034 4361.649
## 2012.750 3392.886 2633.068 4152.704 2230.845 4554.928
## 2012.769 3913.235 3146.463 4680.007 2740.558 5085.912
## 2012.788 3340.260 2566.596 4113.923 2157.043 4523.476
## 2012.807 3445.283 2664.789 4225.777 2251.620 4638.946
## 2012.827 3752.684 2965.418 4539.950 2548.665 4956.703
# Make predictions on the validation set using the STL + ARIMA model
stl_arima_pred <- model_forecasts(train_data, validate_set, stl_arima)
## 381.97 sec elapsed
# Calculate the WMAE (weighted mean absolute error) on the STL + ARIMA validation predictions
WMAE(stl_arima_pred)
## [1] 1324.513
# Plot the STL decomposition + ARIMA forecasts against the observed series
forecast_plots(basic_ts,
list(
stl_arima_mod
),
c(
"STL_ARIMA"
)
)
# Summary: STL decomposition with ARIMA forecasting achieves a WMAE of 1324.513 on the validation set, the lowest of the candidate models.
# Generate the final test-period forecasts with STL + ARIMA, the candidate
# model with the lowest validation WMAE (1324.513 vs 1362.378 for STL + ETS)
final_stl_arima_mod <- model_forecasts(store_dept_mts, nums_wks_test, stl_arima)
## 396.7 sec elapsed
# Shift part of one holiday week's forecast into the following week.
# For every store-department column: if the forecast for row 8 of the test
# period is more than double that of row 9, move 2.5/7 of row 8's sales
# into row 9. The 2.5/7 factor suggests 2.5 of 7 selling days spill across
# the week boundary — presumably the Christmas split; confirm against the
# test-period calendar.
adjust_holidays <- function(full_forecast){
  shift_week <- function(col_fc){
    pre_wk <- col_fc[8]
    hol_wk <- col_fc[9]
    if (pre_wk > 2 * hol_wk) {
      moved <- pre_wk * (2.5 / 7)
      col_fc[8] <- pre_wk - moved
      col_fc[9] <- hol_wk + moved
    }
    col_fc
  }
  # apply over columns preserves the matrix shape of the forecast grid
  apply(full_forecast, 2, shift_week)
}
# Apply the holiday-week shift to the final forecast matrix
final_forecast <- adjust_holidays(final_stl_arima_mod)
# Carry the Store_Dept column names over from the source matrix
# (apply() output is re-labelled to keep the identifiers)
store_dept_names <- colnames(store_dept_mts)
colnames(final_forecast) <- store_dept_names
# Create a tibble of the weekly test dates (7-day steps from begin_test to end_test)
test_dates <- tibble("Date" = seq(begin_test, end_test, by = 7))
# Bind dates to forecasts and reshape wide -> long:
# one row per (Date, Store_Dept) with its forecast Weekly_Sales
final <-
cbind(test_dates, final_forecast) %>%
pivot_longer(!Date, names_to = "Store_Dept", values_to = "Weekly_Sales")
# Summary: the holiday-adjusted STL + ARIMA forecasts are relabelled and reshaped into long format, one row per store-department and week.
# Write the projected weekly sales to csv.
# Join the forecasts onto the required (Store_Dept, Date) test grid;
# combinations with no forecast (no historical observations) are filled
# with Weekly_Sales = 0, and Id is built as "Store_Dept_Date" to match
# the submission format.
final_wkly_sales <-
test_forecast %>%
left_join(final, by = c("Store_Dept", "Date")) %>%
replace_na(list(Weekly_Sales = 0)) %>%
mutate(Id = paste0(Store_Dept, "_", Date)) %>%
dplyr::select(Id, Weekly_Sales)
final_wkly_sales
## # A tibble: 115,064 x 2
## Id Weekly_Sales
## <chr> <dbl>
## 1 1_1_2012-11-02 34210.
## 2 1_1_2012-11-09 19124.
## 3 1_1_2012-11-16 19305.
## 4 1_1_2012-11-23 19881.
## 5 1_1_2012-11-30 23921.
## 6 1_1_2012-12-07 32408.
## 7 1_1_2012-12-14 45340.
## 8 1_1_2012-12-21 32974.
## 9 1_1_2012-12-28 39568.
## 10 1_1_2013-01-04 16261.
## # ... with 115,054 more rows
write_csv(final_wkly_sales, "D:\\Projects\\walmart_weekly_sales_projection.csv")